package com.ewei.web.crawler.utils;

import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Connection.Response;

import com.ewei.web.crawler.domain.URLs;

/**
 * Container of regular expressions used by the crawler, plus helpers that run
 * a regex over an HTML body and collect the matched links as {@link URLs}.
 *
 * @author David
 */
public class RegxContext {

	/**
	 * Matches anchor tags whose href starts with {@code /topic/}.
	 * Group 1 captures the URL path, group 2 captures the link text.
	 * (Note: the class {@code [^//]} is equivalent to {@code [^/]} — the
	 * duplicated slash is redundant but harmless.)
	 */
	public static final String topicUrl = "<a[\\s\\S]*?href=\"(/topic/[^//]*)[^>]*>([^<]*)</a>";

	/** Placeholder for a question-page URL pattern; not yet defined. */
	public static final String quUrl = "";

	/**
	 * Extracts link data from a jsoup HTTP response by delegating to
	 * {@link #crawlDatas(String, String)} with the response body.
	 *
	 * @param regx     regex with two capture groups (URL path, link text)
	 * @param response jsoup response whose body is scanned
	 * @return list of matched URLs, empty if nothing matched
	 */
	public static List<URLs> crawlDatas(String regx, Response response) {
		return crawlDatas(regx, response.body());
	}

	/**
	 * Runs {@code regx} over {@code htmlBody} and builds a {@link URLs} entry
	 * per match: group 1 is taken as the URL path, group 2 (trimmed) as the
	 * link text. The regex must therefore define at least two groups.
	 *
	 * @param regx     regex with two capture groups (URL path, link text)
	 * @param htmlBody HTML text to scan
	 * @return list of matched URLs, empty if nothing matched
	 */
	public static List<URLs> crawlDatas(String regx, String htmlBody) {
		Pattern pattern = Pattern.compile(regx);
		Matcher matcher = pattern.matcher(htmlBody);
		List<URLs> datas = new ArrayList<URLs>();
		while (matcher.find()) {
			datas.add(new URLs(matcher.group(1), matcher.group(2).trim()));
		}
		return datas;
	}

	/**
	 * Manual smoke test: reads one token from stdin and prints every
	 * {@code /topic/} link (path and text) found in it.
	 */
	public static void main(String[] args) throws Exception {
		// try-with-resources: the original leaked the Scanner.
		try (Scanner scanner = new Scanner(System.in)) {
			String str = scanner.next();
			// NOTE(review): re-decoding this String's GBK bytes as UTF-8 will
			// garble non-ASCII input; presumably a console-charset workaround
			// on a GBK Windows terminal — confirm before relying on it.
			str = new String(str.getBytes("GBK"), "UTF-8");
			System.out.println(str);
			Matcher matcher = Pattern.compile(topicUrl).matcher(str);
			while (matcher.find()) {
				System.out.println(matcher.group(1) + "   " + matcher.group(2));
			}
		}
	}
}
